date()
## [1] "Mon Nov 27 16:02:38 2023"
R-Script: https://github.com/venkatharina/IODS-project/blob/Data/create_human.R
library(MASS)
library(tidyr)
library(corrplot)
## corrplot 0.92 loaded
library(ggplot2)
#loading the data (Boston housing data shipped with the MASS package, loaded above)
data("Boston")
#exploring the dataset
dim(Boston) #506 rows and 14 variables (the output below confirms 506, not 504)
## [1] 506 14
str(Boston) #numeric and integer vectors
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ black : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
#Data describes methodological problems associated with the use of housing market data to measure the willingness to pay for clean air
#With the use of a hedonic housing price model and data for the Boston metropolitan area, quantitative estimates of the willingness to pay for air quality improvements are generated
#Marginal air pollution damages (as revealed in the housing market) are found to increase with the level of air pollution and with household income
#graphical overview and summary of the data:
#NOTE(review): plot() on a data.frame dispatches to pairs(), so the next two
#calls draw the same scatterplot matrix twice — one of them could be dropped
plot(Boston)
pairs(Boston)
#numerical summary: min / quartiles / mean / max for every column
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
#pairwise correlations between all 14 variables
cor_matrix <- cor(Boston)
#visualize the correlation matrix (circle size/colour encode strength/sign)
corrplot(cor_matrix, method="circle")
#lots of variables, some seem to not correlate, some seem to correlate negatively/positively with each other
#scaling data: center each column to mean 0 and scale it to unit variance;
#scale() returns a matrix, so coerce back to a data.frame
boston_scaled <- as.data.frame(scale(Boston))
summary(boston_scaled)
## crim zn indus chas
## Min. :-0.419367 Min. :-0.48724 Min. :-1.5563 Min. :-0.2723
## 1st Qu.:-0.410563 1st Qu.:-0.48724 1st Qu.:-0.8668 1st Qu.:-0.2723
## Median :-0.390280 Median :-0.48724 Median :-0.2109 Median :-0.2723
## Mean : 0.000000 Mean : 0.00000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.007389 3rd Qu.: 0.04872 3rd Qu.: 1.0150 3rd Qu.:-0.2723
## Max. : 9.924110 Max. : 3.80047 Max. : 2.4202 Max. : 3.6648
## nox rm age dis
## Min. :-1.4644 Min. :-3.8764 Min. :-2.3331 Min. :-1.2658
## 1st Qu.:-0.9121 1st Qu.:-0.5681 1st Qu.:-0.8366 1st Qu.:-0.8049
## Median :-0.1441 Median :-0.1084 Median : 0.3171 Median :-0.2790
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.5981 3rd Qu.: 0.4823 3rd Qu.: 0.9059 3rd Qu.: 0.6617
## Max. : 2.7296 Max. : 3.5515 Max. : 1.1164 Max. : 3.9566
## rad tax ptratio black
## Min. :-0.9819 Min. :-1.3127 Min. :-2.7047 Min. :-3.9033
## 1st Qu.:-0.6373 1st Qu.:-0.7668 1st Qu.:-0.4876 1st Qu.: 0.2049
## Median :-0.5225 Median :-0.4642 Median : 0.2746 Median : 0.3808
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 1.6596 3rd Qu.: 1.5294 3rd Qu.: 0.8058 3rd Qu.: 0.4332
## Max. : 1.6596 Max. : 1.7964 Max. : 1.6372 Max. : 0.4406
## lstat medv
## Min. :-1.5296 Min. :-1.9063
## 1st Qu.:-0.7986 1st Qu.:-0.5989
## Median :-0.1811 Median :-0.1449
## Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.6024 3rd Qu.: 0.2683
## Max. : 3.5453 Max. : 2.9865
#all the data was scaled to zero (mean to zero)
#`crim` (per capita crime rate by town)
#cut the variable to get the high, low and middle rates of crime into their own categories
#(the original re-coerced crim with as.numeric(); scale() already yields a
# numeric column, so that no-op line was dropped)
#using the quantiles as the break points (bins)
bins <- quantile(boston_scaled$crim)
bins
## 0% 25% 50% 75% 100%
## -0.419366929 -0.410563278 -0.390280295 0.007389247 9.924109610
#creating the categorical variable; include.lowest keeps the minimum value in the first bin
crime <- cut(boston_scaled$crim, breaks = bins, include.lowest = TRUE)
#removing original crim from the dataset
boston_scaled <- dplyr::select(boston_scaled, -crim)
#adding the new categorical variable to the scaled data
boston_scaled <- data.frame(boston_scaled, crime)
#creating testing and training data (80 % train / 20 % test):
n <- nrow(Boston)
#floor() makes the non-integer size explicit instead of relying on sample()'s
#silent truncation of n * 0.8 (= 404.8) to 404
ind <- sample(n, size = floor(n * 0.8))
train <- boston_scaled[ind, ]
test <- boston_scaled[-ind, ]
#doing linear discriminant analysis: predict the crime category from all
#other (scaled) variables in the training set
lda.fit = lda(crime ~ ., data=train)
#printing the fit shows the group priors, group means and LD coefficients
lda.fit
## Call:
## lda(crime ~ ., data = train)
##
## Prior probabilities of groups:
## [-0.419,-0.411] (-0.411,-0.39] (-0.39,0.00739] (0.00739,9.92]
## 0.2500000 0.2574257 0.2500000 0.2425743
##
## Group means:
## zn indus chas nox rm
## [-0.419,-0.411] 1.0493320 -0.9304643 -0.155385496 -0.8878773 0.4706016
## (-0.411,-0.39] -0.1353591 -0.2437711 -0.007331936 -0.5730003 -0.1256699
## (-0.39,0.00739] -0.3929954 0.1182795 0.234426408 0.3132186 0.1371383
## (0.00739,9.92] -0.4872402 1.0171960 -0.071456607 1.0713166 -0.4152575
## age dis rad tax ptratio
## [-0.419,-0.411] -0.8993787 0.8765764 -0.6953230 -0.7140039 -0.476580685
## (-0.411,-0.39] -0.3625265 0.3243112 -0.5500918 -0.4747678 0.008103357
## (-0.39,0.00739] 0.3891035 -0.3526371 -0.3928554 -0.3092396 -0.239225343
## (0.00739,9.92] 0.8330926 -0.8630831 1.6373367 1.5134896 0.779855168
## black lstat medv
## [-0.419,-0.411] 0.37068690 -0.78199177 0.55181657
## (-0.411,-0.39] 0.32217992 -0.14811224 -0.01590369
## (-0.39,0.00739] 0.09678494 -0.03410673 0.19494574
## (0.00739,9.92] -0.85587610 0.86524580 -0.66349081
##
## Coefficients of linear discriminants:
## LD1 LD2 LD3
## zn 0.07071560 0.86551072 -0.76408303
## indus 0.02786994 -0.39061897 0.53486577
## chas -0.07947947 -0.13497302 0.05565763
## nox 0.45256883 -0.48908322 -1.40606522
## rm -0.12443158 -0.02424534 -0.07767991
## age 0.23821036 -0.29926131 -0.33000640
## dis -0.04982303 -0.32512804 0.16212076
## rad 3.20832668 0.67264314 0.29189832
## tax -0.09653873 0.25276207 0.18181895
## ptratio 0.10087302 0.05322751 -0.14142084
## black -0.13245317 -0.01033133 0.13599966
## lstat 0.18027739 -0.19713506 0.37187585
## medv 0.15809684 -0.28663775 -0.24576072
##
## Proportion of trace:
## LD1 LD2 LD3
## 0.9463 0.0389 0.0148
# Draw the discriminant coefficients of an LDA fit as arrows from the origin
# onto an existing LDA plot, labelling each arrow with its variable name.
#   x           - a fitted lda object (coef(x) gives the coefficient matrix)
#   myscale     - multiplier applied to arrow lengths and label positions
#   arrow_heads - arrow-head length passed to graphics::arrows()
#   color, tex  - colour and text size for arrows and labels
#   choices     - which two discriminants to plot (columns of coef(x))
lda.arrows <- function(x, myscale = 1, arrow_heads = 0.1, color = "red", tex = 0.75, choices = c(1,2)){
  coefs <- coef(x)
  x_tips <- myscale * coefs[, choices[1]]
  y_tips <- myscale * coefs[, choices[2]]
  graphics::arrows(x0 = 0, y0 = 0, x1 = x_tips, y1 = y_tips,
                   col = color, length = arrow_heads)
  # label each arrow tip with the variable name, offset above the point
  text(myscale * coefs[, choices], labels = row.names(coefs),
       cex = tex, col = color, pos = 3)
}
#numeric class labels of the training crime categories
classes <- as.numeric(train$crime)
#plotting lda results (biplot): observations on the first two discriminants
plot(lda.fit, dimen = 2)
#overlay variable-contribution arrows
lda.arrows(lda.fit, myscale = 1)
#saving the crime categories from the test set
correct_classes <- test$crime
#then removing the categorical crime variable from the test dataset
test <- dplyr::select(test, -crime)
# predict classes with test data
lda.pred <- predict(lda.fit, newdata = test)
# cross tabulate the results: rows = true classes, columns = predicted classes
table(correct = correct_classes, predicted = lda.pred$class)
## predicted
## correct [-0.419,-0.411] (-0.411,-0.39] (-0.39,0.00739] (0.00739,9.92]
## [-0.419,-0.411] 12 14 0 0
## (-0.411,-0.39] 3 14 5 0
## (-0.39,0.00739] 0 5 19 1
## (0.00739,9.92] 0 0 0 29
#predicted values seem correct only in class 4 (0.00739,9.92)
#mostly predictions seem okay-ish (majority in right class), but it could be better
#reloading the data to get back the original, unmodified Boston set
data("Boston")
#scaling data (same standardization as before):
boston_scaled <- as.data.frame(scale(Boston))
summary(boston_scaled$dis)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.2658 -0.8049 -0.2790 0.0000 0.6617 3.9566
#calculating (Euclidean, the dist() default) distances between the observations
distance <- dist(boston_scaled)
summary(distance)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1343 3.4625 4.8241 4.9111 6.1863 14.3970
#trying to identify the right cluster number with the elbow method:
#total within-cluster sum of squares (WCSS) for k = 1..k_max
set.seed(140)
k_max <- 10
#FIX: run kmeans() on the *scaled* data here — the original used the raw
#Boston data, while the final clustering below uses boston_scaled, so the
#elbow was computed on different data than the one actually clustered
twcss <- vapply(seq_len(k_max),
                function(k) kmeans(boston_scaled, centers = k)$tot.withinss,
                numeric(1))
#qplot() is deprecated since ggplot2 3.4.0; build the plot with ggplot() directly
ggplot(data.frame(k = seq_len(k_max), twcss = twcss), aes(x = k, y = twcss)) +
  geom_line()
#choose k where the WCSS drops sharply (the original note picked 6; re-inspect
#the elbow after this fix to confirm — TODO)
km <- kmeans(boston_scaled, centers = 6)
# plot the dataset with observations coloured by their assigned cluster
pairs(boston_scaled, col = km$cluster)
############ BONUS ############
#reloading data:
data("Boston")
#scaling data:
boston_scaled <- as.data.frame(scale(Boston))
#k-means clustering with 8 centers
#NOTE(review): no set.seed() before this call, so the cluster assignment
#differs between runs — consider seeding for reproducibility
km3 <- kmeans(boston_scaled, centers = 8)
boston_scaled$kmcluster <- as.numeric(km3$cluster)
#fitting an LDA model with the k-means cluster label as the grouping target
#(lda() coerces the numeric kmcluster to a grouping factor; `.` uses all
# remaining columns as predictors)
lda.fit2 = lda(kmcluster~., data=boston_scaled)
lda.fit2
## Call:
## lda(kmcluster ~ ., data = boston_scaled)
##
## Prior probabilities of groups:
## 1 2 3 4 5 6 7
## 0.21343874 0.04150198 0.21541502 0.06521739 0.10869565 0.15415020 0.10276680
## 8
## 0.09881423
##
## Group means:
## crim zn indus chas nox rm
## 1 -0.3768106 -0.425505051 -0.3701112 0.09221725 -0.2286148 -0.35000772
## 2 3.2082952 -0.487240187 1.0149946 -0.27232907 1.0998477 -1.45566793
## 3 -0.4060502 -0.005364112 -0.6583486 -0.27232907 -0.8405622 -0.04480167
## 4 1.1193656 -0.487240187 1.0149946 -0.27232907 0.9950574 -0.19450805
## 5 -0.4146327 2.524683754 -1.2040537 -0.20074543 -1.2034679 0.73835915
## 6 0.4620310 -0.487240187 1.0149946 0.13147608 1.0021383 -0.15800782
## 7 -0.2727474 -0.487240187 1.5613334 0.10623826 1.1631724 -0.63230595
## 8 -0.3681800 -0.053323566 -0.7442735 0.59383298 -0.2416605 1.68533550
## age dis rad tax ptratio black
## 1 0.3376923 -0.1286995 -0.5799077 -0.53244743 0.22283672 0.2729966
## 2 1.0064301 -1.0710604 1.6596029 1.52941294 0.80577843 -0.1691195
## 3 -1.0244892 0.8700429 -0.5720054 -0.70802741 -0.07989337 0.3722489
## 4 0.7406815 -0.8461740 1.6596029 1.52941294 0.80577843 -3.2850509
## 5 -1.4205771 1.6272606 -0.6832698 -0.53843478 -0.89403340 0.3540831
## 6 0.6920432 -0.7470446 1.6596029 1.52941294 0.80577843 0.1640227
## 7 0.9324774 -0.9083143 -0.6130366 0.03111252 -0.35520300 -0.1409865
## 8 0.1056916 -0.2903328 -0.4926242 -0.78414273 -1.08156698 0.3392477
## lstat medv
## 1 0.1641254 -0.25817616
## 2 2.0102765 -1.39479170
## 3 -0.5625097 0.09778119
## 4 1.1195132 -1.03254705
## 5 -0.9678176 0.83523455
## 6 0.3945960 -0.31539874
## 7 0.6799267 -0.51668841
## 8 -0.9695286 1.72241105
##
## Coefficients of linear discriminants:
## LD1 LD2 LD3 LD4 LD5
## crim 0.19364384 0.126100236 -0.20191652 -0.376580103 -0.838531764
## zn -0.05343615 1.094813039 1.50799254 -1.133131460 0.197678623
## indus 0.55399704 -1.280702654 1.13241037 -1.197258173 0.124603383
## chas -0.03302284 -0.024772810 -0.14210875 0.080434989 0.122406528
## nox 0.05435103 -0.424948658 0.26397352 -0.771840326 0.285880375
## rm -0.04813200 0.181941956 0.09971698 0.277384152 0.689446855
## age 0.14103637 -0.651476207 -0.09019746 0.008315478 0.502754473
## dis -0.33219936 0.286780273 0.03476554 -0.281273725 -0.252480041
## rad 5.93891512 1.865585385 -1.41594980 0.744695993 0.427251550
## tax -0.05791010 0.519204288 0.49438996 -0.580198607 0.087932891
## ptratio 0.21221433 -0.011384841 0.01954111 0.048612579 -0.116143664
## black -0.19991944 0.269843607 -1.52326403 -1.543950260 -0.002263929
## lstat 0.07512642 -0.006494829 0.08239517 0.043407190 -0.209298121
## medv -0.08095390 0.223086304 -0.03618324 -0.137450228 0.487212102
## LD6 LD7
## crim -1.055793974 0.76410951
## zn -0.472879553 -0.75684606
## indus 0.908040391 1.05671771
## chas -0.139637868 -0.18984158
## nox 0.186010320 0.32881295
## rm -0.077931268 0.31058189
## age -0.573790467 -0.91403286
## dis 0.721069945 0.66003474
## rad 0.851480550 0.28819620
## tax -0.245047274 -0.77133610
## ptratio -0.008931086 -0.50612180
## black 0.001693988 -0.02201946
## lstat -0.741261471 0.23456713
## medv -0.594939439 0.54523880
##
## Proportion of trace:
## LD1 LD2 LD3 LD4 LD5 LD6 LD7
## 0.7632 0.1051 0.0506 0.0403 0.0205 0.0141 0.0062
#plotting the LDA fit on its first two discriminants, with variable arrows
plot(lda.fit2, dimen = 2)
lda.arrows(lda.fit2, myscale = 1)
############ SUPERBONUS ############
#taking the (scaled) predictor columns of the training set, without the crime target
model_predictors <- dplyr::select(train, -crime)
# check the dimensions
dim(model_predictors)
## [1] 404 13
dim(lda.fit$scaling)
## [1] 13 3
# matrix multiplication: project the training observations onto the
# three discriminant axes (columns LD1..LD3 of the scaling matrix)
matrix_product <- as.matrix(model_predictors) %*% lda.fit$scaling
matrix_product <- as.data.frame(matrix_product)
#Next, install and access the plotly package. Create a 3D plot (cool!) of the columns of the matrix product using the code below.
#install.packages("plotly")
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
#3D scatter of the training observations in discriminant space
plot_ly(x = matrix_product$LD1, y = matrix_product$LD2, z = matrix_product$LD3, type = 'scatter3d', mode = 'markers')
#same plot, coloured by the true crime class of each training observation
#BUG FIX: the original passed the *string* 'train$crime' (quoted), which plotly
#treats as a single constant colour label, so every point got the same colour;
#pass the factor itself so the four crime classes map onto the four colours
plot_ly(x = matrix_product$LD1, y = matrix_product$LD2, z = matrix_product$LD3, type = 'scatter3d', mode = 'markers', color = train$crime, colors = c('red','green','blue','violet'))